ephrin_binding.ipynb¶
Analyze receptor selection data using soluble bat Ephrin-B2 or -B3
- Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# input configs
altair_config = None
nipah_config = None
# input files
binding_E2_file = None
binding_E3_file = None
# output images
entry_binding_combined_corr_plot = None
entry_binding_combined_corr_plot_agg = None
E2_E3_correlation = None
E2_E3_correlation_site = None
combined_E2_E3_site_corr = None
binding_by_site_plot = None
entry_binding_corr_heatmap = None
binding_corr_heatmap = None
binding_region_boxplot_plot = None
binding_region_bubble_plot = None
max_binding_in_stalk = None
max_binding_in_contact = None
combined_contact_ranked_bar_output = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
entry_binding_combined_corr_plot = (
"results/images/entry_binding_combined_corr_plot.html"
)
entry_binding_combined_corr_plot_agg = (
"results/images/entry_binding_combined_corr_plot_agg.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
combined_E2_E3_site_corr = "results/images/combined_E2_E3_site_corr.html"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
entry_binding_corr_heatmap = "results/images/entry_binding_corr_heatmap.html"
binding_corr_heatmap = "results/images/binding_corr_heatmap.html"
binding_region_boxplot_plot = "results/images/binding_region_boxplot_plot.html"
binding_region_bubble_plot = "results/images/binding_region_bubble_plot.html"
max_binding_in_contact = "results/images/max_binding_in_contact.html"
max_binding_in_stalk = "results/images/max_binding_in_stalk.html"
combined_contact_ranked_bar_output = (
"results/images/combined_contact_ranked_bar_output.html"
)
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if (
os.getcwd()
== "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
pass
print("Already in correct directory")
else:
os.chdir(
"/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
)
print("Setup in correct directory")
Setup in correct directory
hard paths for running in interactive mode¶
In [5]:
if nipah_config is None:
##hard paths in case don't want to run with snakemake
print("loading hard paths")
altair_config = "data/custom_analyses_data/theme.py"
nipah_config = "nipah_config.yaml"
# input files
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
Run config files to setup altair theme and config variables¶
In [6]:
if altair_config:
with open(altair_config, "r") as file:
exec(file.read())
with open(nipah_config) as f:
config = yaml.safe_load(f)
Import the filtered binding and entry data for bEFNB2 and bEFNB3¶
In [7]:
# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
display(df_E2_filter.head(3))
df_E3_filter = pd.read_csv(binding_E3_file)
display(df_E3_filter.head(3))
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | D | -0.7817 | 1.6460 | 3.25 | -1.164 | 0.8890 | 4.500 | 1.0 |
| 1 | 71 | Q | E | 0.1659 | 0.4309 | 3.50 | -1.255 | 0.3123 | 5.375 | 1.0 |
| 2 | 71 | Q | F | -0.3429 | 0.5299 | 3.00 | -1.058 | 0.6637 | 4.625 | 1.0 |
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | C | -0.1332 | 0.1525 | 3.0 | -0.7227 | 0.7828 | 3.000 | 1.0 |
| 1 | 71 | Q | D | -0.2937 | 0.3305 | 4.0 | -0.3884 | 0.6369 | 3.429 | 1.0 |
| 2 | 71 | Q | E | -0.3527 | 0.1762 | 4.0 | -0.2482 | 0.9791 | 4.571 | 1.0 |
Merge data¶
In [8]:
## Merge the filtered EFNB2 and EFNB3 DataFrames for combined analysis.
df_binding_effect_merge = pd.merge(
df_E2_filter,
df_E3_filter,
on=["site", "wildtype", "mutant"],
suffixes=["_E2", "_E3"],
how="outer",
)
display(df_binding_effect_merge.head(3))
## Add a 'selection' column to distinguish between EFNB2 and EFNB3 data.
df_E2_filter["selection"] = "bEFNB2"
df_E3_filter["selection"] = "bEFNB3"
## Concatenate the DataFrames for plotting or further analysis.
df_binding_effect_concat = pd.concat([df_E2_filter, df_E3_filter])
display(df_binding_effect_concat.head(3))
| site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | D | -0.7817 | 1.6460 | 3.25 | -1.164 | 0.8890 | 4.500 | 1.0 | -0.29370 | 0.3305 | 4.0 | -0.3884 | 0.6369 | 3.429 | 1.0 |
| 1 | 71 | Q | E | 0.1659 | 0.4309 | 3.50 | -1.255 | 0.3123 | 5.375 | 1.0 | -0.35270 | 0.1762 | 4.0 | -0.2482 | 0.9791 | 4.571 | 1.0 |
| 2 | 71 | Q | F | -0.3429 | 0.5299 | 3.00 | -1.058 | 0.6637 | 4.625 | 1.0 | -0.03982 | 0.1283 | 3.0 | -0.4973 | 0.3080 | 3.286 | 1.0 |
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | D | -0.7817 | 1.6460 | 3.25 | -1.164 | 0.8890 | 4.500 | 1.0 | bEFNB2 |
| 1 | 71 | Q | E | 0.1659 | 0.4309 | 3.50 | -1.255 | 0.3123 | 5.375 | 1.0 | bEFNB2 |
| 2 | 71 | Q | F | -0.3429 | 0.5299 | 3.00 | -1.058 | 0.6637 | 4.625 | 1.0 | bEFNB2 |
In [9]:
# What are the top 5 highest and lowest binding mutants for EFNB2 and EFNB3?
def find_highest_lowest(df, name):
print(f"We are analyzing {name}\n")
print(f"The total number of mutants was: {df.shape[0]}\n")
tmp_df = df.sort_values(by="binding_mean")
print("These are the lowest binding mutants detected:")
display(tmp_df.head(5))
tmp_df_high = df.sort_values(by="binding_mean", ascending=False)
print("\nThese are the highest binding mutants detected:\n")
display(tmp_df_high.head(5))
# What about mutants with positive entry scores?
tmp_df = df[df["effect"] > 0].sort_values(by="binding_mean")
print("These are the lowest binding mutants detected with positive entry scores:")
display(tmp_df.head(5))
tmp_df_high = df[df["effect"] > 0].sort_values(by="binding_mean", ascending=False)
print(
"\nThese are the highest binding mutants detected with positive entry scores:\n"
)
display(tmp_df_high.head(5))
mean_df = df.groupby('site')['binding_mean'].sum().reset_index()
display(mean_df.sort_values(by='binding_mean',ascending=False).head(10))
find_highest_lowest(df_E2_filter, "bEFNB2")
find_highest_lowest(df_E3_filter, "bEFNB3")
We are analyzing bEFNB2 The total number of mutants was: 6804 These are the lowest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5484 | 501 | E | K | -5.280 | 0.9284 | 24.50 | -0.6440 | 0.6061 | 32.00 | 1.0 | bEFNB2 |
| 5861 | 532 | A | V | -5.259 | 1.1400 | 16.25 | -0.3199 | 0.3122 | 17.88 | 1.0 | bEFNB2 |
| 1817 | 236 | R | H | -5.054 | 0.8828 | 22.00 | -1.2530 | 0.3352 | 29.38 | 1.0 | bEFNB2 |
| 5354 | 490 | Q | K | -4.966 | 1.0810 | 8.50 | -0.6916 | 0.5391 | 11.38 | 1.0 | bEFNB2 |
| 5352 | 490 | Q | H | -4.752 | 0.8619 | 8.25 | -0.4499 | 0.6679 | 10.88 | 1.0 | bEFNB2 |
These are the highest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6308 | 566 | F | C | 2.160 | 2.1190 | 5.00 | -0.5326 | 0.4547 | 5.500 | 0.75 | bEFNB2 |
| 6491 | 580 | I | S | 2.152 | 1.7680 | 4.75 | -1.0010 | 0.6921 | 6.375 | 1.00 | bEFNB2 |
| 3054 | 331 | N | E | 2.144 | 1.2330 | 3.00 | -0.8872 | 0.8436 | 3.500 | 1.00 | bEFNB2 |
| 2111 | 269 | N | E | 2.075 | 2.2410 | 5.00 | -1.0480 | 0.4835 | 7.875 | 1.00 | bEFNB2 |
| 6596 | 586 | N | T | 1.962 | 0.5779 | 5.25 | -0.8163 | 0.8231 | 7.000 | 1.00 | bEFNB2 |
These are the lowest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6225 | 558 | A | V | -4.647 | 0.7295 | 17.25 | 0.1182 | 0.3640 | 18.50 | 1.0 | bEFNB2 |
| 6211 | 557 | N | D | -4.627 | 0.3989 | 9.50 | 0.3848 | 0.4791 | 11.12 | 1.0 | bEFNB2 |
| 5771 | 526 | L | H | -4.546 | 0.7822 | 5.25 | 0.1102 | 0.3920 | 6.75 | 1.0 | bEFNB2 |
| 6510 | 581 | Y | W | -4.519 | 1.0130 | 10.75 | 0.0907 | 0.4101 | 11.50 | 1.0 | bEFNB2 |
| 5367 | 491 | S | H | -4.505 | 0.9350 | 8.50 | 0.1124 | 0.2502 | 10.25 | 1.0 | bEFNB2 |
These are the highest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2334 | 282 | C | S | 1.609 | 1.056 | 4.50 | 0.3378 | 0.3251 | 3.714 | 0.50 | bEFNB2 |
| 4833 | 450 | Q | I | 1.457 | 1.194 | 5.00 | 0.1418 | 0.3988 | 6.000 | 1.00 | bEFNB2 |
| 3193 | 339 | S | A | 1.389 | 2.087 | 2.00 | 0.4493 | 0.1488 | 2.625 | 0.75 | bEFNB2 |
| 3522 | 361 | T | A | 1.371 | 2.108 | 3.50 | 0.3626 | 0.4499 | 4.875 | 1.00 | bEFNB2 |
| 817 | 164 | N | S | 1.297 | 1.136 | 3.75 | 0.2398 | 0.2906 | 6.125 | 1.00 | bEFNB2 |
| site | binding_mean | |
|---|---|---|
| 250 | 331 | 15.73280 |
| 473 | 564 | 15.08690 |
| 97 | 172 | 13.88370 |
| 225 | 306 | 13.85177 |
| 493 | 586 | 13.36944 |
| 103 | 178 | 9.99930 |
| 151 | 227 | 9.97030 |
| 90 | 165 | 8.74900 |
| 239 | 320 | 8.57420 |
| 496 | 589 | 8.56552 |
We are analyzing bEFNB3 The total number of mutants was: 6618 These are the lowest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6124 | 555 | D | K | -2.147 | 1.2250 | 6.0 | 0.08504 | 0.4221 | 6.714 | 1.0 | bEFNB3 |
| 6127 | 555 | D | R | -1.510 | 0.4205 | 4.0 | 0.07591 | 0.5260 | 4.143 | 1.0 | bEFNB3 |
| 5810 | 530 | Q | L | -1.447 | 0.4772 | 3.5 | -1.01900 | 0.8623 | 5.857 | 1.0 | bEFNB3 |
| 6413 | 583 | T | R | -1.410 | 0.4465 | 5.5 | -1.00500 | 0.4854 | 6.286 | 1.0 | bEFNB3 |
| 6410 | 583 | T | K | -1.247 | 0.1132 | 5.0 | -0.72780 | 0.6441 | 7.000 | 1.0 | bEFNB3 |
These are the highest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6460 | 589 | R | G | 2.006 | 0.65910 | 7.0 | -0.9655 | 0.5763 | 8.286 | 1.0 | bEFNB3 |
| 6379 | 580 | I | M | 1.337 | 0.53380 | 5.5 | -0.8155 | 0.5307 | 6.714 | 1.0 | bEFNB3 |
| 2361 | 270 | V | Q | 1.329 | 1.02800 | 5.0 | -0.8846 | 0.6509 | 5.429 | 1.0 | bEFNB3 |
| 2313 | 267 | M | Q | 1.291 | 1.37100 | 4.5 | -0.7981 | 0.8175 | 5.667 | 1.0 | bEFNB3 |
| 5462 | 492 | Q | L | 1.261 | 0.06666 | 5.5 | 0.4833 | 0.1827 | 5.143 | 1.0 | bEFNB3 |
These are the lowest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6124 | 555 | D | K | -2.1470 | 1.2250 | 6.0 | 0.08504 | 0.42210 | 6.714 | 1.0 | bEFNB3 |
| 6127 | 555 | D | R | -1.5100 | 0.4205 | 4.0 | 0.07591 | 0.52600 | 4.143 | 1.0 | bEFNB3 |
| 5813 | 530 | Q | W | -0.9974 | 0.5473 | 3.0 | 0.01818 | 0.66470 | 3.286 | 1.0 | bEFNB3 |
| 6118 | 555 | D | A | -0.8295 | 0.1999 | 3.5 | 0.37550 | 0.23450 | 3.714 | 1.0 | bEFNB3 |
| 4852 | 441 | P | V | -0.7461 | 0.1815 | 5.5 | 0.37080 | 0.08612 | 5.286 | 1.0 | bEFNB3 |
These are the highest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5462 | 492 | Q | L | 1.2610 | 0.06666 | 5.5 | 0.48330 | 0.1827 | 5.143 | 1.0 | bEFNB3 |
| 1708 | 211 | G | F | 1.1690 | 0.28900 | 5.0 | 0.16370 | 0.5716 | 5.714 | 1.0 | bEFNB3 |
| 6543 | 598 | P | G | 1.1540 | 1.55900 | 4.5 | 0.23870 | 0.5146 | 5.429 | 1.0 | bEFNB3 |
| 6098 | 553 | S | W | 0.9943 | 0.43520 | 9.5 | 0.16690 | 0.4060 | 9.857 | 1.0 | bEFNB3 |
| 6402 | 582 | D | W | 0.9761 | 0.33840 | 6.5 | 0.06342 | 0.3096 | 7.143 | 1.0 | bEFNB3 |
| site | binding_mean | |
|---|---|---|
| 490 | 589 | 11.707770 |
| 87 | 161 | 7.332800 |
| 469 | 564 | 6.721080 |
| 225 | 306 | 6.492740 |
| 59 | 132 | 6.168804 |
| 242 | 323 | 6.101950 |
| 470 | 566 | 5.968996 |
| 261 | 342 | 5.728920 |
| 129 | 204 | 5.653570 |
| 175 | 254 | 5.152500 |
Find the top overlapping binders for both bEFNB2 and bEFNB3¶
In [10]:
def find_good_binding_for_both(df):
e2_good = df.sort_values(by='binding_mean_E2',ascending=False)
e3_good = df.sort_values(by='binding_mean_E3',ascending=False)
e2_good = e2_good.head(50)
e3_good = e3_good.head(50)
combo = pd.merge(e2_good,e3_good,on=['site','wildtype','mutant'],how='inner')
display(combo)
find_good_binding_for_both(df_binding_effect_merge)
| site | wildtype | mutant | binding_mean_E2_x | binding_std_E2_x | times_seen_binding_E2_x | effect_E2_x | effect_std_E2_x | times_seen_cell_entry_E2_x | frac_models_E2_x | ... | effect_std_E2_y | times_seen_cell_entry_E2_y | frac_models_E2_y | binding_mean_E3_y | binding_std_E3_y | times_seen_binding_E3_y | effect_E3_y | effect_std_E3_y | times_seen_cell_entry_E3_y | frac_models_E3_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 566 | F | C | 2.160 | 2.1190 | 5.000 | -0.5326 | 0.4547 | 5.500 | 0.75 | ... | 0.4547 | 5.500 | 0.75 | 1.1320 | 1.194000 | 3.5 | -0.8507 | 0.7599 | 3.833 | 1.0 |
| 1 | 580 | I | S | 2.152 | 1.7680 | 4.750 | -1.0010 | 0.6921 | 6.375 | 1.00 | ... | 0.6921 | 6.375 | 1.00 | 0.8187 | 0.030890 | 6.0 | 0.1163 | 0.3235 | 5.571 | 1.0 |
| 2 | 211 | G | F | 1.957 | 0.5488 | 4.750 | -0.6654 | 0.4262 | 5.125 | 1.00 | ... | 0.4262 | 5.125 | 1.00 | 1.1690 | 0.289000 | 5.0 | 0.1637 | 0.5716 | 5.714 | 1.0 |
| 3 | 553 | S | W | 1.945 | 0.2867 | 10.000 | -0.2967 | 0.4293 | 13.250 | 1.00 | ... | 0.4293 | 13.250 | 1.00 | 0.9943 | 0.435200 | 9.5 | 0.1669 | 0.4060 | 9.857 | 1.0 |
| 4 | 139 | N | W | 1.930 | 0.4834 | 5.750 | -0.6672 | 0.6217 | 8.000 | 1.00 | ... | 0.6217 | 8.000 | 1.00 | 1.0030 | 0.004575 | 4.5 | -0.4441 | 0.4452 | 6.286 | 1.0 |
| 5 | 589 | R | G | 1.668 | 0.9018 | 7.333 | -1.3800 | 0.4560 | 11.880 | 0.75 | ... | 0.4560 | 11.880 | 0.75 | 2.0060 | 0.659100 | 7.0 | -0.9655 | 0.5763 | 8.286 | 1.0 |
6 rows × 31 columns
In [11]:
# Compare E2 and E3 binders
def find_highest_lowest(df):
df["binding_diff"] = (df["binding_mean_E2"] - df["binding_mean_E3"]).abs()
print(
"These are the mutants with the biggest difference between EFNB2 and EFNB3:\n"
)
display(df.sort_values(by="binding_diff", ascending=False).head(10))
# calculate aggregate differences
agg_df = (
df.groupby("site")[["binding_mean_E2", "binding_mean_E3", "binding_diff"]]
.mean()
.reset_index()
)
print("These are the sites with the biggest difference between EFNB2 and EFNB3:\n")
display(agg_df.sort_values(by="binding_diff", ascending=False).head(5))
find_highest_lowest(df_binding_effect_merge)
# find_highest_lowest(df_E3_filter,'EFNB3')
These are the mutants with the biggest difference between EFNB2 and EFNB3:
| site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | binding_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5835 | 530 | Q | F | -4.015 | 0.46650 | 6.25 | 0.46740 | 0.2023 | 5.375 | 1.0 | 0.084370 | 0.06198 | 6.0 | 0.3592 | 0.2738 | 6.286 | 1.0 | 4.099370 |
| 5362 | 490 | Q | Y | -4.290 | 0.90020 | 5.50 | -1.08800 | 0.4004 | 6.375 | 1.0 | -0.461000 | 0.57400 | 5.5 | -0.3137 | 0.7967 | 5.714 | 1.0 | 3.829000 |
| 5367 | 491 | S | H | -4.505 | 0.93500 | 8.50 | 0.11240 | 0.2502 | 10.250 | 1.0 | -0.871800 | 0.78980 | 7.0 | -1.3810 | 0.4865 | 8.286 | 1.0 | 3.633200 |
| 5366 | 491 | S | G | -4.181 | 0.59490 | 6.50 | 0.13020 | 0.2975 | 8.625 | 1.0 | -0.562100 | 0.04726 | 5.5 | -0.2358 | 0.6084 | 6.286 | 1.0 | 3.618900 |
| 5361 | 490 | Q | W | -4.205 | 1.00500 | 3.75 | 0.23460 | 0.3779 | 4.750 | 1.0 | -0.586500 | 0.32260 | 3.0 | -0.1506 | 0.4575 | 3.286 | 1.0 | 3.618500 |
| 6627 | 588 | I | P | -2.349 | 0.84050 | 5.25 | -0.25550 | 0.3004 | 4.750 | 1.0 | 1.183000 | 0.25530 | 5.5 | -0.6131 | 0.3819 | 5.429 | 1.0 | 3.532000 |
| 5385 | 492 | Q | K | -3.523 | 0.54860 | 11.00 | 0.35270 | 0.2320 | 11.620 | 1.0 | 0.006126 | 0.07740 | 10.0 | 0.3291 | 0.2404 | 11.570 | 1.0 | 3.529126 |
| 5389 | 492 | Q | R | -2.823 | 0.61110 | 19.25 | -0.07662 | 0.2715 | 23.120 | 1.0 | 0.643700 | 0.05388 | 20.0 | 0.5095 | 0.1192 | 20.570 | 1.0 | 3.466700 |
| 1864 | 239 | S | H | -3.440 | 0.08816 | 5.25 | 0.33410 | 0.2721 | 4.500 | 1.0 | -0.041160 | 0.83220 | 4.0 | -1.2190 | 0.6930 | 5.000 | 1.0 | 3.398840 |
| 5844 | 530 | Q | V | -4.327 | 0.83770 | 6.00 | 0.10570 | 0.2693 | 8.625 | 1.0 | -0.987000 | 0.89600 | 5.5 | -1.4430 | 0.7389 | 7.000 | 1.0 | 3.340000 |
These are the sites with the biggest difference between EFNB2 and EFNB3:
| site | binding_mean_E2 | binding_mean_E3 | binding_diff | |
|---|---|---|---|---|
| 416 | 505 | -3.790625 | -0.770900 | 2.897100 |
| 440 | 530 | -3.631593 | -0.559266 | 2.797646 |
| 406 | 491 | -3.575407 | -0.364667 | 2.785117 |
| 496 | 588 | -3.474250 | 0.037600 | 2.686600 |
| 408 | 494 | -3.915765 | -0.615000 | 2.411333 |
Make plots showing correlation between binding and entry for EFNB2 and EFNB3¶
In [12]:
def plot_corr_binding_entry_updated(df, flag):
# Define interactive selectors for variant selection.
# 'variant_selector' is for selecting individual variants based on 'site' and 'mutant'.
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
)
# 'variant_selector_agg' is for aggregated selection based on 'site' only.
variant_selector_agg = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
# Initialize an empty list to store charts for each unique selection in 'df'.
empty_chart = []
# Loop through each unique cell selection in the DataFrame.
for cell in list(df["selection"].unique()):
# Filter DataFrame for the current selection.
tmp_df = df[df["selection"] == cell]
# Check if aggregation flag is True to aggregate data.
if flag == True:
# Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
agg_df = (
tmp_df.groupby("site")[["binding_mean", "effect"]].mean().reset_index()
)
# Create a chart using aggregated data with specified encodings.
chart = (
alt.Chart(agg_df)
.mark_point(stroke="black", filled=True) # Use filled points with black stroke.
.encode(
x=alt.X(
"effect",
title=f"Mean entry in CHO-{cell}",
axis=alt.Axis(grid=True),
),
y=alt.Y(
"binding_mean",
title=f"Mean {cell} binding",
axis=alt.Axis(grid=True),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
opacity=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0.2)
),
size=alt.condition(
variant_selector_agg, alt.value(100), alt.value(50)
),
strokeWidth=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector_agg, alt.value("orange"), alt.value("black")
),
tooltip=["site", "binding_mean", "effect"],
)
.add_params(variant_selector_agg)
)
# Add the chart to the list.
empty_chart.append(chart)
else:
# Create a chart for individual data points with specified encodings.
chart = (
alt.Chart(tmp_df)
.mark_point(stroke="black", filled=True)
.encode(
x=alt.X(
"effect", title=f"Entry in CHO-{cell}", axis=alt.Axis(grid=True)
),
y=alt.Y(
"binding_mean",
title=f"{cell} binding",
axis=alt.Axis(grid=True),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
opacity=alt.condition(
variant_selector, alt.value(1), alt.value(0.1)
),
size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
strokeWidth=alt.condition(
variant_selector, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean",
"times_seen_binding",
"effect",
],
)
.add_params(variant_selector)
)
# Add the chart to the list.
empty_chart.append(chart)
# Combine all charts horizontally with a title.
combined_chart = alt.hconcat(
*empty_chart, title=alt.Title("Correlation between binding and entry")
)
# Return the combined chart for display or further use.
return combined_chart
# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()
# Save the plot
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_plot.save(entry_binding_combined_corr_plot)
# Repeat for aggregated data.
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat, True)
entry_binding_corr_plot_agg.display()
# Save the aggregated plot
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_plot_agg.save(entry_binding_combined_corr_plot_agg)
Same plot as above, but slightly different format¶
In [13]:
def plot_entry_binding_corr_heatmap(df):
empty_chart = []
for cell in list(df["selection"].unique()):
tmp_df = df[df["selection"] == cell]
chart = (
alt.Chart(tmp_df, title=f"{cell}")
.mark_rect()
.encode(
x=alt.X(
"effect", title="Cell entry", axis=alt.Axis(values=[-2, -1, 0, 1])
).bin(maxbins=50),
y=alt.Y(
"binding_mean",
title="Receptor binding",
axis=alt.Axis(values=[-4, -2, 0, 2]),
).bin(maxbins=50),
color=alt.Color("count()", title="Count").scale(type='log'),
)
).properties(width=200,height=200)
empty_chart.append(chart)
combined_chart = alt.hconcat(
*empty_chart,
).resolve_scale(y="shared", x="shared", color="shared")
return combined_chart
entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_heat.save(entry_binding_corr_heatmap)
Calculate some stats on binding¶
In [14]:
def overall_stats(df, effect, name):
# Now group sites and find sites where all mutants are deleterious
filtered_df = df.groupby("site").filter(lambda group: (group[effect] < -0.25).all())
# Which sites are these?
unique = filtered_df["site"].unique()
# Convert unique to a Pandas Series
unique_series = pd.Series(unique)
# Find the common elements that are also contact sites
unique_contact_bool = unique_series.isin(config["contact_sites"])
# Filter and get the common elements
common_elements = unique_series[unique_contact_bool]
print(f"The dataset we are analyzing is: {name}\n")
# Print the common elements
print(
f"Here are the contact sites that only have negative binding scores: {list(common_elements)}\n"
)
print(f"There are {len(unique)} sites with all negative binding score mutants\n")
print(
f"These are the sites with all negative binding score mutants: {list(unique)}\n"
)
# Now find sites with low and high binding (median)
median_df = (
df.groupby("site")["binding_mean"]
.max()
.reset_index()
.sort_values(by="binding_mean", ascending=False)
)
print("These are the sites with the highest binding mutants:\n")
display(median_df.head(5))
# Now calculate mutant number
total_mutants = df.shape[0]
mutants_above_cutoff_tolerated = df[df["effect"] > 0]
mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[
["site", "effect", "binding_mean", "wildtype", "mutant"]
]
total_sites = df["site"].unique().shape[0]
print(f"The total number of sites are: {total_sites}")
overall_stats(df_E2_filter, "binding_mean", "EFNB2")
overall_stats(df_E3_filter, "binding_mean", "EFNB3")
The dataset we are analyzing is: EFNB2 Here are the contact sites that only have negative binding scores: [242, 389, 488, 490, 491, 501, 504, 505, 531, 532, 533, 557, 579, 581, 588] There are 41 sites with all negative binding score mutants These are the sites with all negative binding score mutants: [100, 116, 220, 236, 238, 242, 243, 248, 346, 351, 352, 389, 390, 400, 435, 438, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590] These are the sites with the highest binding mutants:
| site | binding_mean | |
|---|---|---|
| 474 | 566 | 2.160 |
| 487 | 580 | 2.152 |
| 250 | 331 | 2.144 |
| 188 | 269 | 2.075 |
| 493 | 586 | 1.962 |
The total number of sites are: 510 The dataset we are analyzing is: EFNB3 Here are the contact sites that only have negative binding scores: [389, 488, 501, 505, 531, 532] There are 15 sites with all negative binding score mutants These are the sites with all negative binding score mutants: [108, 352, 389, 467, 486, 488, 494, 495, 497, 501, 505, 510, 531, 532, 584] These are the sites with the highest binding mutants:
| site | binding_mean | |
|---|---|---|
| 490 | 589 | 2.006 |
| 482 | 580 | 1.337 |
| 189 | 270 | 1.329 |
| 186 | 267 | 1.291 |
| 405 | 492 | 1.261 |
The total number of sites are: 504
Find sites with opposite effects on binding¶
In [15]:
# find sites that are different
def find_biggest_differences(df):
efnb2_good_efnb3_bad = df[
(df["binding_mean_E2"] > 0.5) & (df["binding_mean_E3"] < -0.5)
].sort_values(by="binding_mean_E2", ascending=False)
display(efnb2_good_efnb3_bad)
efnb2_bad_efnb3_good = df[
(df["binding_mean_E2"] < -0.5) & (df["binding_mean_E3"] > 0.5)
].sort_values(by="binding_mean_E3", ascending=False)
display(efnb2_bad_efnb3_good)
find_biggest_differences(df_binding_effect_merge)
| site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | binding_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6041 | 546 | L | H | 1.5230 | 1.1450 | 5.25 | -1.1480 | 0.3866 | 6.750 | 1.0 | -0.5431 | 0.6654 | 3.5 | -0.85080 | 0.5440 | 5.143 | 1.0 | 2.0661 |
| 2620 | 303 | P | C | 1.3450 | 1.5770 | 5.00 | -0.7175 | 0.5900 | 5.000 | 1.0 | -0.5659 | 0.0975 | 4.0 | -0.69680 | 0.5942 | 4.857 | 1.0 | 1.9109 |
| 122 | 79 | N | S | 1.2920 | 1.6520 | 3.00 | -0.6352 | 0.4964 | 2.375 | 1.0 | -0.6464 | 0.2595 | 3.0 | -0.12940 | 0.5688 | 3.429 | 1.0 | 1.9384 |
| 103 | 78 | D | I | 0.6586 | 0.6340 | 5.25 | -0.9306 | 0.3975 | 6.500 | 1.0 | -0.5665 | 0.7552 | 5.0 | -0.51670 | 0.8447 | 5.143 | 1.0 | 1.2251 |
| 1681 | 224 | M | Q | 0.6473 | 0.8796 | 3.50 | 0.1659 | 0.3148 | 2.750 | 1.0 | -0.5806 | 0.3813 | 3.5 | 0.20550 | 0.2920 | 4.286 | 1.0 | 1.2279 |
| 5707 | 520 | I | G | 0.5914 | 1.0130 | 5.00 | -1.4650 | 0.3487 | 8.000 | 1.0 | -0.5305 | 0.1476 | 5.5 | -1.30900 | 0.3187 | 7.000 | 1.0 | 1.1219 |
| 1031 | 178 | V | T | 0.5288 | 0.7257 | 3.75 | -0.5276 | 0.7591 | 4.250 | 1.0 | -0.5130 | 0.5995 | 3.0 | -0.01921 | 0.4254 | 3.571 | 1.0 | 1.0418 |
| site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | binding_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6627 | 588 | I | P | -2.3490 | 0.8405 | 5.25 | -0.25550 | 0.3004 | 4.750 | 1.0 | 1.1830 | 0.255300 | 5.5 | -0.6131 | 0.3819 | 5.429 | 1.0 | 3.5320 |
| 6733 | 598 | P | G | -0.5523 | 0.7852 | 5.00 | -0.86740 | 0.9393 | 4.625 | 1.0 | 1.1540 | 1.559000 | 4.5 | 0.2387 | 0.5146 | 5.429 | 1.0 | 1.7063 |
| 5236 | 479 | W | S | -0.8017 | 1.2720 | 3.25 | -1.30400 | 0.8784 | 5.250 | 1.0 | 0.7307 | 2.957000 | 2.0 | -1.4690 | 0.6120 | 3.286 | 1.0 | 1.5324 |
| 3179 | 338 | R | G | -0.5172 | 0.5068 | 4.50 | 0.29960 | 0.4223 | 5.250 | 1.0 | 0.6836 | 0.975700 | 4.0 | 0.1580 | 0.3395 | 3.429 | 1.0 | 1.2008 |
| 5363 | 491 | S | A | -0.9317 | 0.5371 | 5.50 | -0.66860 | 0.8056 | 6.250 | 1.0 | 0.6451 | 0.009846 | 6.5 | 0.1429 | 0.3582 | 6.143 | 1.0 | 1.5768 |
| 5389 | 492 | Q | R | -2.8230 | 0.6111 | 19.25 | -0.07662 | 0.2715 | 23.120 | 1.0 | 0.6437 | 0.053880 | 20.0 | 0.5095 | 0.1192 | 20.570 | 1.0 | 3.4667 |
Find correlations between EFNB2 and EFNB3 binding¶
In [16]:
def plot_entry_binding_corr(df):
chart = (
alt.Chart(df, title="Correlation Between Mutant Binding Scores")
.mark_rect()
.encode(
x=alt.X(
"binding_mean_E2",
title="bEFNB2 binding",
axis=alt.Axis(values=[-4,-2, 0, 2]),
).bin(maxbins=50),
y=alt.Y(
"binding_mean_E3",
title="bEFNB3 binding",
axis=alt.Axis(values=[-2, 0, 2]),
).bin(maxbins=50),
color=alt.Color("count()", title="Count").scale(type='log'),
)
).properties(width=200,height=200)
return chart
entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_heatmap_1.save(binding_corr_heatmap)
In [17]:
def plot_affinity_solid(df):
df = df.dropna()
# calculate r value
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df["binding_mean_E2"], df["binding_mean_E3"]
)
r_value = float(r_value)
# make chart
chart = (
alt.Chart(
df,
title=alt.Title(
"Correlation Between Mutant Binding Scores"
),
)
.mark_point(color="black", size=30, opacity=0.2, filled=True)
.encode(
x=alt.X("binding_mean_E2", title=("bEFNB2 binding")),
y=alt.Y("binding_mean_E3", title=("bEFNB3 binding")),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean_E2",
"binding_mean_E3",
"effect_E2",
"effect_E3",
],
)
)
highlight = chart.transform_filter(
(alt.datum.site == 458) & (alt.datum.mutant == 'Y') | # Example condition
(alt.datum.site == 507) & (alt.datum.mutant == 'I') #|
#(alt.datum.site == 589) & (alt.datum.mutant == 'G') |
#(alt.datum.site == 580) & (alt.datum.mutant == 'S') |
#(alt.datum.site == 492) & (alt.datum.mutant == 'L') |
#(alt.datum.site == 492) & (alt.datum.mutant == 'R') |
#(alt.datum.site == 566) & (alt.datum.mutant == 'C') |
#(alt.datum.site == 211) & (alt.datum.mutant == 'F') |
#(alt.datum.site == 553) & (alt.datum.mutant == 'W') |
#(alt.datum.site == 546) & (alt.datum.mutant == 'H') |
#(alt.datum.site == 555) & (alt.datum.mutant == 'K')
).mark_point(
color="black", size=40, opacity=1, filled=True, stroke='orange', strokeWidth=1
)
min = int(df["binding_mean_E2"].min())
max = int(df["binding_mean_E3"].max())
text = (
alt.Chart({"values": [{"x": min, "y": max, "text": f"r = {r_value:.2f}"}]})
.mark_text(align="left", baseline="top", dx=-10, dy=-20)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
chart_and_text = chart + text+ highlight
return chart_and_text
E2_E3_corr = plot_affinity_solid(df_binding_effect_merge)
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_corr.save(E2_E3_correlation)
Plot correlations between summary statistics for each site¶
In [18]:
def plot_affinity_solid_mean(df):
df = df.dropna()
means = (
df.groupby("site")
.agg(
{
"effect_E2": "mean",
"effect_E3": "mean",
"binding_mean_E2": "mean",
"binding_mean_E3": "mean",
"wildtype": "first",
}
)
.reset_index()
)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
means["binding_mean_E2"], means["binding_mean_E3"]
)
r_value = float(r_value)
chart = (
alt.Chart(
means,
title=alt.Title(
"Correlation between Aggregate Mutant Binding Scores",
subtitle=f"r={r_value:.2f}",
),
)
.mark_point(size=50, opacity=0.3,filled=True,color='black')
.encode(
x=alt.X(
"binding_mean_E2",
title=("Avg EFNB2 Binding"),
axis=alt.Axis(tickCount=3),
),
y=alt.Y(
"binding_mean_E3",
title=("Avg EFNB3 Binding"),
axis=alt.Axis(tickCount=3),
),
tooltip=[
"site",
"wildtype",
"binding_mean_E2",
"binding_mean_E3",
"effect_E2",
"effect_E3",
],
)
)
text = (
alt.Chart({"values": [{"x": -3.5, "y": 0.5, "text": f"r = {r_value:.2f}"}]})
.mark_text(align="left", baseline="top", dx=0, dy=0)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
chart_and_text = chart
return chart_and_text
E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_site_corr.save(E2_E3_correlation_site)
if entry_binding_combined_corr_plot is not None:
(E2_E3_corr | E2_E3_site_corr).save(combined_E2_E3_site_corr)
Make plot showing binding by site (median)¶
In [19]:
def plot_affinity_by_site_median(df):
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
empty_charts = []
for selection in ["binding_mean_E2", "binding_mean_E3"]:
if selection == "binding_mean_E2":
name = "bEFNB2 binding"
else:
name = "bEFNB3 binding"
mean = df.groupby("site")[selection].max().reset_index()
mean = mean[mean[selection] >= 0]
chart = (
alt.Chart(mean)
.mark_bar(stroke="black",size=2)
.encode(
x=alt.X(
"site",
title=("Site"),
axis=alt.Axis(grid=True, tickCount=4),
scale=alt.Scale(domain=[70, 602]),
),
y=alt.Y(selection, title=(name), axis=alt.Axis(grid=True, tickCount=3)),
tooltip=["site"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.5)),
strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
)
.properties(height=150, width=800)
.add_params(variant_selector)
)
empty_charts.append(chart)
combined_chart = alt.vconcat(*empty_charts, spacing=1, title="Max Binding by Site")
return combined_chart
binding_by_site = plot_affinity_by_site_median(df_binding_effect_merge)
binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
binding_by_site.save(binding_by_site_plot)
In [20]:
def plot_affinity_by_contact_site(df, sites_to_show, title_text):
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
empty_charts = []
contact_df = df[df["site"].isin(sites_to_show)]
sites = list(contact_df["site"].unique())
for selection in df["selection"].unique():
tmp_df = contact_df[contact_df["selection"] == selection]
mean = tmp_df.groupby("site")["binding_mean"].mean().reset_index()
chart = (
alt.Chart(mean)
.mark_point(size=100,filled=True,stroke='black')
.encode(
x=alt.X(
"site:N",
sort='-y',
title=("Site"),
axis=alt.Axis(grid=True, labelAngle=-90),
scale=alt.Scale(domain=sites),
),
y=alt.Y(
"binding_mean", title=(f"{selection}"), axis=alt.Axis(grid=True)
),
tooltip=["site"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
)
.add_params(variant_selector)
)
empty_charts.append(chart)
combined_chart = alt.vconcat(*empty_charts, spacing=1, title=title_text)
return combined_chart
contact_binding_by_site = plot_affinity_by_contact_site(
df_binding_effect_concat, config["contact_sites"], "Max Binding in Contact"
)
contact_binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
contact_binding_by_site.save(max_binding_in_contact)
contact_binding_by_site_stalk = plot_affinity_by_contact_site(
df_binding_effect_concat, list(range(96, 147)), "Max Binding in Stalk"
)
contact_binding_by_site_stalk.display()
if entry_binding_combined_corr_plot is not None:
contact_binding_by_site_stalk.save(max_binding_in_stalk)
In [21]:
display(df_binding_effect_concat.head(3))
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | D | -0.7817 | 1.6460 | 3.25 | -1.164 | 0.8890 | 4.500 | 1.0 | bEFNB2 |
| 1 | 71 | Q | E | 0.1659 | 0.4309 | 3.50 | -1.255 | 0.3123 | 5.375 | 1.0 | bEFNB2 |
| 2 | 71 | Q | F | -0.3429 | 0.5299 | 3.00 | -1.058 | 0.6637 | 4.625 | 1.0 | bEFNB2 |
In [22]:
def plot_affinity_by_contact_site(df, sites_to_show,name):
# Filter the DataFrame based on the sites to show
contact_df = df[df["site"].isin(sites_to_show)]
# Aggregate and sort the DataFrame
aggregated_df = contact_df.groupby('site')['binding_mean'].max().reset_index()
aggregated_df = aggregated_df.sort_values(by='binding_mean', ascending=False)
sorted_sites = aggregated_df['site'].tolist()
# Define a selection for highlighting bars on hover
variant_selector = alt.selection_point(on="mouseover", nearest=True, empty=False, fields=["site"])
# Create the chart
chart = alt.Chart(aggregated_df,title=f'{name}').mark_bar(stroke='black').encode(
y=alt.Y("site:N", title="Site", scale=alt.Scale(domain=sorted_sites)),
x=alt.X("max(binding_mean):Q", title="Max binding mutant at site"),
color=alt.condition(variant_selector, alt.value("orange"), alt.value("black")),
strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
).add_params(variant_selector).properties(width=200, height=alt.Step(12))
return chart
In [23]:
contact_binding_by_site = plot_affinity_by_contact_site(df_E2_filter, config["contact_sites"],'bEFNB2')
contact_binding_by_site.display()
contact_binding_by_site_e3 = plot_affinity_by_contact_site(df_E3_filter, config["contact_sites"],'bEFNB3')
contact_binding_by_site_e3.display()
In [24]:
combined_contact_ranked_bar = alt.hconcat(contact_binding_by_site,contact_binding_by_site_e3).properties(title='Binding at receptor contact sites')
combined_contact_ranked_bar.display()
combined_contact_ranked_bar.save(combined_contact_ranked_bar_output)
Make bubble plots for binding in different areas of receptor pocket¶
In [25]:
def make_boxplot_binding_region(
df, title
): # Create a box plot using Altair for aggregated means
barrel_ranges = {
"Hydrophobic": config["hydrophobic"],
"Salt Bridges": config["salt_bridges"],
"Hydrogen Bonds": config["h_bond_total"],
"Contact": config["contact_sites"],
"Overall": list(range(71, 602)),
}
mean_df = df.groupby("site")[["binding_mean"]].median().reset_index()
custom_order = [
"Hydrophobic",
"Salt Bridges",
"Hydrogen Bonds",
"Contact",
"Overall",
]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"barrel": barrel, "effect": row["binding_mean"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df)
.mark_point(size=50, opacity=0.4)
.encode(
x=alt.X(
"barrel:O", sort=custom_order, title=None, axis=alt.Axis(labelAngle=-90)
),
y=alt.Y(
"effect",
title=f"Median {title} Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
xOffset="random:Q",
tooltip=["barrel", "effect", "site"],
)
.transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
)
return chart.display()
make_boxplot_binding_region(df_E2_filter, "EFNB2")
make_boxplot_binding_region(df_E3_filter, "EFNB3")
make boxplot of binding scores by region¶
In [26]:
def make_boxplot_binding_region(df):
barrel_ranges = {
"Stalk": list(range(96, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
"Receptor Contact": config["contact_sites"],
"Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in df["selection"].unique():
tmp_df = df[df["selection"] == selection]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{
"region": barrel,
"binding_mean": row["binding_mean"],
"site": row["site"],
}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_boxplot(color="darkgray", extent="min-max", opacity=1)
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"binding_mean",
title=f"Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
tooltip=["region", "binding_mean", "site"],
)
.properties(width=config["width"], height=config["height"])
)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="shared", x="shared", color="independent"
)
return combined_effect_chart
entry_region_boxplot = make_boxplot_binding_region(df_binding_effect_concat)
entry_region_boxplot.display()
if entry_binding_combined_corr_plot is not None:
entry_region_boxplot.save(binding_region_boxplot_plot)
In [27]:
def make_bubble_binding_region(df):
barrel_ranges = {
"Stalk": list(range(96, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
"Receptor Contact": config["contact_sites"],
#"Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact"]
empty_charts = []
for selection in df["selection"].unique():
tmp_df = df[df["selection"] == selection]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{
"region": barrel,
"binding_mean": row["binding_mean"],
"site": row["site"],
"mutant": row["mutant"],
}
)
agg_means_df = pd.DataFrame(agg_means)
variant_selector = alt.selection_point(
on="mouseover", empty=False, fields=["site", "mutant"], value=1
)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_point(opacity=0.3, stroke="black",filled=True)
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"binding_mean",
title=f"Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
xOffset="random:Q",
tooltip=["region", "binding_mean", "site", "mutant"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
size=alt.condition(variant_selector, alt.value(50), alt.value(15)),
)
.transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
.properties(width=config["width"], height=config["height"])
).add_params(variant_selector)
empty_charts.append(chart)
combined_effect_chart = (
alt.hconcat(*empty_charts)
.resolve_scale(y="shared", x="shared", color="independent")
.add_params(variant_selector)
)
return combined_effect_chart
entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()
if entry_binding_combined_corr_plot is not None:
entry_region_bubble.save(binding_region_bubble_plot)
In [ ]: